{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 在MNIST(手写数字识别数据集)上使用PCA\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function fetch_mldata is deprecated; fetch_mldata was deprecated in version 0.20 and will be removed in version 0.22. Please use fetch_openml.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function mldata_filename is deprecated; mldata_filename was deprecated in version 0.20 and will be removed in version 0.22. Please use fetch_openml.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np \n",
    "from sklearn.datasets import fetch_mldata\n",
    "# 为了方便调试，自己去https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat 下载并放到当前目录下的mldata中\n",
    "# mnist = fetch_mldata('MNIST original') # 下载数据集可能会比较耗时，我们不用这种方法了\n",
    "mnist = fetch_mldata('MNIST original', data_home='./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'DESCR': 'mldata.org dataset: mnist-original',\n",
       " 'COL_NAMES': ['label', 'data'],\n",
       " 'target': array([0., 0., 0., ..., 9., 9., 9.]),\n",
       " 'data': array([[0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        ...,\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0],\n",
       "        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mnist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = mnist['data'], mnist['target']\n",
    "X_train = np.array(X[:60000], dtype=float)\n",
    "y_train = np.array(y[:60000], dtype=float)\n",
    "X_test = np.array(X[60000:], dtype=float)\n",
    "y_test = np.array(y[60000:], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60000, 784)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape # 60000张图片，每张都是28*28的，下面显示一张看一下"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_tmp = X_train[0].reshape((28, 28))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAOEUlEQVR4nO3dcYwV5bnH8d8jLUalENSIG9Ha22Bym0YXQUJiU6lNG4sm0JhWiHFp2mRJLAk1jam2q5DUGxujNGoicaukWLlCFS3Y1EsNS/TemDSuSBVLW6mhdMuGFTWyxEQqPPePHZoVd95Zzpk5c+D5fpLNOWeenTOPx/0xc847c15zdwE49Z1WdwMAWoOwA0EQdiAIwg4EQdiBID7Vyo2ZGR/9AxVzdxtreVN7djO7xsz+Yma7zey2Zp4LQLWs0XF2M5sg6a+SviZpQNLLkha7+58S67BnBypWxZ59jqTd7v6Wux+WtF7SgiaeD0CFmgn7BZL+MerxQLbsY8ys28z6zay/iW0BaFIzH9CNdajwicN0d++V1CtxGA/UqZk9+4CkC0c9ni5pX3PtAKhKM2F/WdIMM/ucmU2UtEjS5nLaAlC2hg/j3f0jM1smaYukCZLWuPsbpXUGoFQND701tDHeswOVq+SkGgAnD8IOBEHYgSAIOxAEYQeCIOxAEIQdCIKwA0EQdiAIwg4EQdiBIAg7EARhB4Ig7EAQhB0IgrADQRB2IAjCDgRB2IEgCDsQBGEHgmjplM049cyaNStZX7ZsWW6tq6srue5jjz2WrD/44IPJ+vbt25P1aNizA0EQdiAIwg4EQdiBIAg7EARhB4Ig7EAQzOKKpM7OzmS9r68vWZ88eXKZ7XzM+++/n6yfc845lW27neXN4trUSTVmtkfSsKQjkj5y99nNPB+A6pRxBt1X3P1ACc8DoEK8ZweCaDbsLun3ZvaKmXWP9Qtm1m1m/WbW3+S2ADSh2cP4K919n5mdJ+l5M/uzu784+hfcvVdSr8QHdECdmtqzu/u+7HZI0jOS5pTRFIDyNRx2MzvLzD5z7L6kr0vaWVZjAMrVzGH8NEnPmNmx5/lvd/+fUrpCy8yZkz4Y27hxY7I+ZcqUZD11Hsfw8HBy3cOHDyfrRePoc+fOza0VXetetO2TUcNhd/e3JF1WYi8AKsTQGxAEYQeCIOxAEIQdCIKwA0Fwiesp4Mwzz8ytXX755cl1H3/88WR9+vTpyXo29Jor9fdVNPx1zz33JOvr169P1lO99fT0JNe9++67k/V2lneJK3t2IAjCDgRB2IEgCDsQBGEHgiDsQBCEHQiCKZtPAQ8//HBubfHixS3s5MQUnQMwadKkZP2FF15I1ufNm5dbu/TSS5PrnorYswNBEHYgCMIOBEHYgSAIOxAEYQeCIOxAEIyznwRmzZqVrF977bW5taLrzYsUjWU/++yzyfq9996bW9u3b19y3VdffTVZf++995L1q6++OrfW7OtyMmLPDgRB2IEgCDsQBGEHgiDsQBCEHQiCsANB8L3xbaCzszNZ7+vrS9YnT57c8Lafe+65ZL3oevirrroqWU9dN/7II48k13377beT9SJHjhzJrX3wwQfJdYv+u4q+875ODX9vvJmtMbMhM9s5atnZZva8mb2Z3U4ts1kA5RvPYfwvJV1z3LLbJG119xmStmaPAbSxwrC7+4uS3j1u8QJJa7P7ayUtLLkvACVr9Nz4ae4+KEnuPmhm5+X9opl1S+pucDsASlL5hTDu3iupV+IDOqBOjQ697TezDknKbofKawlAFRoN+2ZJS7L7SyRtKqcdAFUpHGc3syckzZN0rqT9klZI+o2kX0u6SNJeSd9y9+M/xBvruUIexl9yySXJ+ooVK5L1RYsWJesHDhzIrQ0ODibXveuuu5L1p556KllvZ6lx9qK/+w0bNiTrN954Y0M9tULeOHvhe3Z3zzur4qtNdQSgpThdFgiCsANBEHYgCMIOBEHYgSD4KukSnH766cl66uuUJWn+/PnJ+vDwcLLe1dWVW+vv70+ue8YZZyTrUV100UV1t1A69uxAEIQdCIKwA0EQdiAIwg4EQdiBIAg7EATj7CWYOXNmsl40jl5kwYIFyXrRtMqAxJ4dCIOwA0EQdiAIwg4EQdiBIAg7EARhB4JgnL0Eq1atStbNxvxm338rGidnHL0xp52Wvy87evRoCztpD+zZgSAIOxAEYQeCIOxAEIQdCIKwA0EQdiAIxtnH6brrrsutdXZ2Jtctmh548+bNDfWEtNRYetH/kx07dpTdTu0K9+xmtsbMhsxs56hlK83sn2a2I/tp7tsZAFRuPIfxv5R0zRjLf+7undnP78ptC0DZCsPu7i9KercFvQCoUDMf0C0zs9eyw/ypeb9kZt1m1m9m6UnHAFSq0bCvlvR5SZ2SBiXdl/eL7t7r7rPdfXaD2wJQgobC7u773f2Iux+V9AtJc8ptC0DZGgq7mXWMevhNSTvzfhdAeygcZzezJyTNk3SumQ1IWiFpnpl1SnJJeyQtrbDHtpCax3zixInJdYeGhpL1DRs2NNTTqa5o3vuVK1c2/Nx9fX3J+u23397wc7erwrC7++IxFj9aQS8AKsTpskAQhB0IgrADQRB2IAjCDgTBJa4t8OGHHybrg4ODLeqkvRQNrfX09CTrt956a7I+MDCQW7vvvtyTPiVJhw4dStZPRuzZgSAIOxAEYQeCIOxAEIQdCIKwA0EQdiAIxtlbIPJXRae+ZrtonPyGG25I1jdt2pSsX3/99cl6NOzZgSAIOxAEYQeCIOxAEIQdCIKwA0EQdiAIxtnHycwaqknSwoULk/Xly5c31FM7uOWWW5L1O+64I7c2ZcqU5Lrr1q1L1ru6upJ1fBx7diAIwg4EQdiBIAg7EARhB4Ig7EAQhB0IgnH2cXL3hmqSdP755yfrDzzwQLK+Zs2aZP2dd97Jrc2dOze57k033ZSsX3bZZcn69OnTk/W9e/fm1rZs2ZJc96GHHkrWcWIK9+xmdqGZbTOzXWb2hpktz5afbWbPm9mb2e3U6tsF0KjxHMZ/JOmH7v6fkuZK+r6ZfUHSbZK2uvsMSVuzxwDaVGHY3X3Q3bdn94cl7ZJ0gaQFktZmv7ZWUvqcUAC1OqH37GZ2saSZkv4gaZq7D0oj/yCY2Xk563RL6m6uTQDNGnfYzWySpI2SfuDuB4su/jjG3Xsl9WbPkf4kC0BlxjX0Zmaf1kjQ17n709ni/WbWkdU7JA1V0yKAMhTu2W1kF/6opF3uvmpUabOkJZJ+lt2mv9c3sAkTJiTrN998c7Je9JXIBw8ezK3NmDEjuW6zXnrppWR927ZtubU777yz7HaQMJ7D+Csl3STpdTPbkS37sUZC/msz+56kvZK+VU2LAMpQGHZ3/z9JeW/Qv1puOwCqwumyQBCEHQiCsANBEHYgCMIOBGFFl2eWurGT+Ay61KWcTz75ZHLdK664oqltF52t2Mz/w9TlsZK0fv36ZP1k/hrsU5W7j/kHw54dCIKwA0EQdiAIwg4EQdiBIAg7EARhB4JgnL0EHR0dyfrSpUuT9Z6enmS9mXH2+++/P7nu6tWrk/Xdu3cn62g/jLMDwRF2IAjCDgRB2IEgCDsQBGEHgiDsQBCMswOnGMbZgeAIOxAEYQeCIOxAEIQdCIKwA0EQdiCIwrCb2YVmts3MdpnZG2a2PFu+0sz+aWY7sp/51bcLoFGFJ9WYWYekDnffbmafkfSKpIWSvi3pkLvfO+6NcVINULm8k2rGMz/7oKTB7P6wme2SdEG57QGo2gm9ZzeziyXNlPSHbNEyM3vNzNaY2dScdbrNrN/M+pvqFEBTxn1uvJlNkvSCpP9y96fNbJqkA5Jc0k81cqj/3YLn4DAeqFjeYfy4wm5mn5b0W0lb3H3VGPWLJf3W3b9Y8DyEHahYwxfC2MhXmz4qadfooGcf3B3zTUk7m20SQHXG82n8lyT9r6TXJR3NFv9Y0mJJnRo5jN8jaWn2YV7qudizAxVr6jC+LIQdqB7XswPBEXYgCMIOBEHYgSAIOxAEYQeCIOxAEIQdCIKwA0EQdiAIwg4EQdiBIAg7EARhB4Io/MLJkh2Q9PdRj8/NlrWjdu2tXfuS6K1RZfb22bxCS69n/8TGzfrdfXZtDSS0a2/t2pdEb41qVW8cxgNBEHYgiLrD3lvz9lPatbd27Uuit0a1pLda37MDaJ269+wAWoSwA0HUEnYzu8bM/mJmu83stjp6yGNme8zs9Wwa6lrnp8vm0Bsys52jlp1tZs+b2ZvZ7Zhz7NXUW1tM452YZrzW167u6c9b/p7dzCZI+qukr0kakPSypMXu/qeWNpLDzPZImu3utZ+AYWZflnRI0mPHptYys3skvevuP8v+oZzq7j9qk95W6gSn8a6ot7xpxr+jGl+7Mqc/b0Qde/Y5kna7+1vufljSekkLauij7bn7i5LePW7xAklrs/trNfLH0nI5vbUFdx909+3Z/WFJx6YZr/W1S/TVEnWE/QJJ/xj1eEDtNd+7S/q9mb1iZt11NzOGacem2cpuz6u5n+MVTuPdSsdNM942r10j0583q46wjzU1TTuN/13p7pdL+oak72eHqxif1ZI+r5E5AAcl3VdnM9k04xsl/cDdD9bZy2hj9NWS162OsA9IunDU4+mS9tXQx5jcfV92OyTpGY287Wgn+4/NoJvdDtXcz7+5+353P+LuRyX9QjW+dtk04xslrXP3p7PFtb92Y/XVqtetjrC/LGmGmX3OzCZKWiRpcw19fIKZnZV9cCIzO0vS19V+U1FvlrQku79E0qYae/mYdpnGO2+acdX82tU+/bm7t/xH0nyNfCL/N0k/qaOHnL7+Q9Ifs5836u5N0hMaOaz7l0aOiL4n6RxJWyW9md2e3Ua9/UojU3u/ppFgddTU25c08tbwNUk7sp/5db92ib5a8rpxuiwQBGfQAUEQdiAIwg4EQdiBIAg7EARhB4Ig7EAQ/w8+sGPVrnT8WgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.imshow(img_tmp, cmap='gray') # 默认是彩色\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60000,)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 784)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000,)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用kNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 19.9 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "                     weights='uniform')"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "knn_clf = KNeighborsClassifier()\n",
    "%time knn_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 9min 35s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9688"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time knn_clf.score(X_test, y_test) # 这里是对所有数据进行训练，所以耗时较长"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA进行降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "pca = PCA(0.90) # 取90%有效数据对应的维度即可\n",
    "pca.fit(X_train)\n",
    "X_train_reduction = pca.transform(X_train)\n",
    "X_test_reduction = pca.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60000, 87)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_reduction.shape # 想保留90%的数据只需要降维到87维即可(即图像变成从784列变成87列)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 降维有时去除了噪音，有可能准确率更高！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 340 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "                     weights='uniform')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "knn_clf = KNeighborsClassifier()\n",
    "%time knn_clf.fit(X_train_reduction, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 58.1 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9728"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time knn_clf.score(X_test_reduction, y_test) # 降维后速度明显加快了,而且准确率还提高了，可以看出降维过程中还去除了部分噪声"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
